# Libraries
library(tidyverse)
library(lubridate)
library(waffle)
library(kableExtra)
library(ggthemes)
library(ggalluvial)

# Parameters
input_path <- here::here("data", "mspa_na_data.rds")
metadata_path <- here::here("data-raw", "school_metadata.csv")
figure_out_path <- file.path("~", "Desktop", "jgme_final", "Figures")

total_md <- 153
total_do <- 36

likert_colors <- c("darkgreen","green","orange","red","darkred")

likert_labels <- 
  c(
    "Strongly disagree",
    "Somewhat disagree", 
    "Neither agree nor disagree", 
    "Somewhat agree", 
    "Strongly agree"
  )

interest_labels <- 
  c(
    "Not at all interested",
    "Less interested", 
    "Undecided", 
    "Somewhat interested", 
    "Very interested"
  )

my_theme <- 
  theme(
    axis.text.y = element_text(size = 9),
    axis.text.x = element_text(size = 9, angle = 45, hjust = 1),
    axis.title.y = element_text(size = 12, face = "bold"), 
    axis.title.x = element_text(size = 12, face = "bold")
  ) 

mspa_hex <- "#666699"

#===============================================================================

# read in data
na_data <- 
  input_path %>% 
  read_rds()

metadata <- 
  metadata_path %>% 
  read_csv() %>% 
  drop_na()

Data quality and cleaning

I’ve already done some data cleaning in previous scripts, but there are still a few tweaks that we probably want to make so that everything is human-readable and so that only useful information is carried forward.

First, we want to remove people who did not consent to the survey, since we can’t use their information anyway. We will also remove students who are not in schools within the U.S. or who did not provide the name of the school that they attend.

na_data <- 
  na_data %>% 
  filter(consent == "yes", school_attend != "International") %>% 
  drop_na(school_attend)

And then we can filter out some variables that we don’t need.

na_data <- 
  na_data %>% 
  select(-survey_id, -consent, -complete)

And create a few variables that we’ll need later:

na_data <- 
  na_data %>% 
  mutate(
    sex = 
      case_when(
        sab_is_male == "yes"   ~ "male", 
        sab_is_female == "yes" ~ "female", 
        TRUE                   ~ NA_character_
      ), 
    is_urm = if_else(race_white == "yes", "no", "yes"), 
    is_gm = 
      case_when(
        gender_agender == "yes" ~ "yes", 
        gender_genderqueer == "yes" ~ "yes", 
        gender_transman == "yes" ~ "yes", 
        gender_transwoman == "yes" ~ "yes", 
        gender_another == "yes" ~ "yes", 
        TRUE ~ "no"
      )
  ) %>% 
  left_join(metadata, by = c("school_attend" = "name")) %>% 
  mutate(degree_type = factor(Degree, levels = c("MD", "DO"))) %>% 
  drop_na(degree_type)

na_data
## # A tibble: 1,162 x 116
##    participant_id timestamp school_attend med_school_year med_school_year…
##             <dbl> <chr>     <chr>         <chr>           <chr>           
##  1              1 2018-08-… Stanford Uni… Pre-Clinical S… <NA>            
##  2              2 2018-08-… Stanford Uni… Research (PhD,… <NA>            
##  3              3 2018-08-… Stanford Uni… Research (PhD,… <NA>            
##  4              4 2018-08-… Johns Hopkin… Pre-Clinical S… <NA>            
##  5              6 2018-08-… Stanford Uni… Pre-Clinical S… <NA>            
##  6              8 2018-08-… Stanford Uni… Research (PhD,… <NA>            
##  7              9 2018-08-… Stanford Uni… Clinical Stude… <NA>            
##  8             10 2018-08-… Stanford Uni… Pre-Clinical S… <NA>            
##  9             11 2018-08-… Stanford Uni… Pre-Clinical S… <NA>            
## 10             12 2018-08-… Stanford Uni… Pre-Clinical S… <NA>            
## # … with 1,152 more rows, and 111 more variables: is_lgbtq <chr>,
## #   sab_is_male <chr>, sab_is_female <chr>, gender_man <chr>,
## #   gender_woman <chr>, gender_agender <chr>, gender_genderqueer <chr>,
## #   gender_transman <chr>, gender_transwoman <chr>, gender_another <chr>,
## #   gender_another_description <chr>, so_asexual <chr>, so_bisexual <chr>,
## #   so_gay <chr>, so_lesbian <chr>, so_pansexual <chr>, so_queer <chr>,
## #   so_questioning <chr>, so_heterosexual <chr>, so_another <chr>,
## #   so_another_description <chr>, race_native <chr>, race_asian <chr>,
## #   race_black <chr>, race_pi <chr>, race_white <chr>, race_hispanic <chr>,
## #   race_another <chr>, race_another_explanation <chr>,
## #   interaction_agree <dbl>, interaction_satisfaction <dbl>,
## #   personal_benefit_mspa <dbl>, community_benefit_mspa <dbl>,
## #   enhanced_activity_mspa_lgbtq_meded <chr>,
## #   enhanced_activity_mspa_social <chr>,
## #   enhanced_activity_mspa_di_training <chr>,
## #   enhanced_activity_mspa_discrim_bias_reduction <chr>,
## #   enhanced_activity_mspa_mentorship <chr>,
## #   enhanced_activity_mspa_advocacy <chr>,
## #   enhanced_activity_mspa_global_health <chr>,
## #   enhanced_activity_mspa_other <chr>,
## #   enhanced_activity_mspa_other_explanation <chr>,
## #   school_affinity_group_exist <chr>, school_affinity_group_benefit <dbl>,
## #   school_affinity_group_involved <chr>, why_not_involved_time <chr>,
## #   why_not_involved_value <chr>, why_not_involved_opportunities <chr>,
## #   why_not_involved_uninterested <chr>, why_not_involved_not_queer <chr>,
## #   why_not_involved_another <chr>, why_not_involved_another_explanation <chr>,
## #   school_activities_advocacy <chr>, school_activities_social <chr>,
## #   school_activities_mentorship <chr>, school_activities_educational <chr>,
## #   school_activities_research <chr>, school_activities_intercollegiate <chr>,
## #   school_activities_other <chr>, school_activities_other_explanation <chr>,
## #   school_affinity_group_mission <dbl>, school_affinity_group_supported <dbl>,
## #   school_affinity_group_identify <dbl>, interest_lgbtq_meded <dbl>,
## #   interest_lgbtq_social <dbl>, interest_lgbtq_bias_training <dbl>,
## #   interest_lgbtq_advocacy <dbl>, interest_lgbtq_global_health <dbl>,
## #   interest_lgbtq_other <chr>, importance_lgbtq_meded <chr>,
## #   importance_lgbtq_social <chr>, importance_lgbtq_bias_training <chr>,
## #   importance_lgbtq_mentorship <chr>, importance_lgbtq_advocacy <chr>,
## #   importance_lgbtq_global_health <chr>, satisfaction_lgbtq_meded <dbl>,
## #   satisfaction_lgbtq_social <dbl>, satisfaction_bias_training <dbl>,
## #   satisfaction_lgbtq_mentorship <dbl>, satisfaction_lgbtq_advocacy <dbl>,
## #   satisfaction_lgbtq_global_health <dbl>, out_classmates_peers <chr>,
## #   out_labmates_coworkers_team <chr>, out_mentors <chr>,
## #   out_medical_school_app <chr>, out_residency_app <chr>, out_other <chr>,
## #   out_other_explanation <chr>, ability_out_classmates_peers <chr>,
## #   ability_out_labmates_coworkers_team <chr>, ability_out_mentors <chr>,
## #   ability_out_medical_school_app <chr>, ability_out_residency_app <chr>,
## #   ability_out_other <chr>, ability_out_other_explanation <chr>,
## #   protections_out_classmates_peers <chr>,
## #   protections_out_labmates_coworkers_team <chr>,
## #   protections_out_mentors <chr>, protections_out_medical_school_app <chr>,
## #   protections_out_residency_app <chr>, …

Univariate Demographics report

The first section of the survey asked questions about respondents’ demographic identifiers, including the following:

Below, we calculate the breakdown of our survey respondents by each of these demographics.

What school do you attend?

Schools are arranged in order of decreasing number of responses.

na_data %>%
  count(school_attend, name = "Number of responses") %>% 
  mutate(
    `Percentage of total responses` = 
      ((`Number of responses` / sum(`Number of responses`)) * 100) %>% 
      round(digits = 1)
  ) %>% 
  rename(School = school_attend) %>% 
  arrange(desc(`Number of responses`)) %>% 
  knitr::kable()
School Number of responses Percentage of total responses
Temple University School of Medicine 84 7.2
University of Louisville School of Medicine 62 5.3
University of Oklahoma College of Medicine 57 4.9
Geisinger Commonwealth School of Medicine 44 3.8
Stanford University School of Medicine 40 3.4
University of Pittsburgh School of Medicine 39 3.4
University of New Mexico School of Medicine 37 3.2
Saint Louis University School of Medicine 34 2.9
University of Alabama School of Medicine 34 2.9
University of Arkansas for Medical Sciences/UAMS College of Medicine 34 2.9
Johns Hopkins University School of Medicine 32 2.8
University of Michigan Medical School 32 2.8
Western Michigan University Homer Stryker M.D. School of Medicine 30 2.6
Dell Medical School at The University of Texas at Austin 28 2.4
Chicago Medical School of Rosalind Franklin University of Medicine and Science 26 2.2
Touro University California College of Osteopathic Medicine 23 2.0
University of Wisconsin School of Medicine and Public Health 23 2.0
Georgetown University School of Medicine 22 1.9
Tulane University School of Medicine 22 1.9
Jacobs School of Medicine and Biomedical Sciences, University at Buffalo 21 1.8
Rutgers New Jersey Medical School 18 1.5
University of Texas Southwestern Medical School at Dallas 17 1.5
Harvard Medical School 15 1.3
Columbia University Roy and Diana Vagelos College of Physicians and Surgeons 14 1.2
Washington University School of Medicine 14 1.2
Weill Cornell Medical College 14 1.2
Albert Einstein College of Medicine 13 1.1
Alpert Medical School at Brown University 13 1.1
University of California, Irvine School of Medicine 13 1.1
University of Texas School of Medicine at San Antonio 11 0.9
University of Vermont College of Medicine 11 0.9
Cooper Medical School of Rowan University 10 0.9
University of Iowa Roy J. and Lucille A. Carver College of Medicine 9 0.8
University of Kansas School of Medicine 9 0.8
Vanderbilt University School of Medicine 9 0.8
West Virginia University School of Medicine 9 0.8
Edward Via College of Osteopathic Medicine 8 0.7
Keck School of Medicine of University of Southern California 8 0.7
University of North Texas Health Science Center Texas College of Osteopathic Medicine 8 0.7
Baylor College of Medicine 7 0.6
New York University School of Medicine 7 0.6
Philadelphia College of Osteopathic Medicine - Georgia Campus 7 0.6
State University of New York Downstate Medical Center College of Medicine 7 0.6
University of South Alabama College of Medicine 7 0.6
Michigan State University College of Human Medicine 6 0.5
University of California, San Fransisco School of Medicine 6 0.5
Yale School of Medicine 6 0.5
David Geffen School of Medicine at UCLA 5 0.4
Northwestern University Feinberg School of Medicine 5 0.4
University of Arizona College of Medicine - Phoenix 5 0.4
University of California, Davis School of Medicine 5 0.4
University of Florida College of Medicine 5 0.4
Lake Erie College of Osteopathic Medicine 4 0.3
Meharry Medical College School of Medicine 4 0.3
Pennsylvania State University College of Medicine 4 0.3
Perelman School of Medicine at the University of Pennsylvania 4 0.3
Philadelphia College of Osteopathic Medicine 4 0.3
Touro College of Osteopathic Medicine 4 0.3
University of Utah School of Medicine 4 0.3
University of Washington School of Medicine 4 0.3
Des Moines University College of Osteopathic Medicine 3 0.3
Indiana University School of Medicine 3 0.3
Medical College of Wisconsin 3 0.3
Oklahoma State University Center for Health Sciences College of Osteopathic Medicine 3 0.3
Sidney Kimmel Medical College at Thomas Jefferson University 3 0.3
Stony Brook University School of Medicine 3 0.3
Tufts University School of Medicine 3 0.3
University of Chicago Pritzker School of Medicine 3 0.3
University of Texas Medical School at Houston 3 0.3
A. T. Still University Kirksville College of Osteopathic Medicine 2 0.2
Dartmouth College Geisel School of Medicine 2 0.2
Edward Via College of Osteopathic Medicine- Carolinas Campus 2 0.2
Emory University School of Medicine 2 0.2
Hackensack Meridian School of Medicine 2 0.2
Icahn School of Medicine at Mount Sinai 2 0.2
Loma Linda University School of Medicine 2 0.2
Ohio University Heritage College of Osteopathic Medicine 2 0.2
Oregon Health & Science University School of Medicine 2 0.2
Rowan University School of Osteopathic Medicine 2 0.2
Rutgers Robert Wood Johnson Medical School 2 0.2
Sanford School of Medicine of the University of South Dakota 2 0.2
Texas Tech University Health Sciences Center School of Medicine 2 0.2
Touro University Nevada College of Osteopathic Medicine 2 0.2
University of Illinois at Urbana-Champaign Carle Illinois College of Medicine 2 0.2
University of Illinois College of Medicine 2 0.2
University of Massachusetts Medical School 2 0.2
University of Nebraska College of Medicine 2 0.2
University of Rochester School of Medicine and Dentistry 2 0.2
University of South Carolina School of Medicine 2 0.2
University of Toledo College of Medicine 2 0.2
Wayne State University School of Medicine 2 0.2
Boonshoft School of Medicine at Wright State University 1 0.1
Boston University School of Medicine 1 0.1
Burrell College of Osteopathic Medicine at New Mexico State University 1 0.1
Campbell University School of Osteopathic Medicine 1 0.1
Central Michigan University College of Medicine 1 0.1
Charles R. Drew University of Medicine and Science 1 0.1
Creighton University School of Medicine 1 0.1
Donald and Barbara Zucker School of Medicine at Hofstra/Northwell 1 0.1
Drexel University College of Medicine 1 0.1
Duke University School of Medicine 1 0.1
Eastern Virginia Medical School 1 0.1
Florida Atlantic University Charles E. Schmidt College of Medicine 1 0.1
Florida State University College of Medicine 1 0.1
Frank H. Netter M.D. School of Medicine at Quinnipiac University 1 0.1
Marian University College of Osteopathic Medicine 1 0.1
Mayo Clinic College of Medicine 1 0.1
Michigan State University College of Osteopathic Medicine 1 0.1
Nova Southeastern University College of Osteopathic Medicine 1 0.1
Oakland University William Beaumont School of Medicine 1 0.1
Rush Medical College 1 0.1
San Juan Bautista School of Medicine 1 0.1
Southern Illinois University School of Medicine 1 0.1
State University of New York Upstate Medical University 1 0.1
The Ohio State University College of Medicine 1 0.1
University of Colorado School of Medicine 1 0.1
University of Kentucky College of Medicine 1 0.1
University of Maryland School of Medicine 1 0.1
University of Minnesota Medical School 1 0.1
University of Nevada, Las Vegas School of Medicine 1 0.1
University of North Carolina School of Medicine 1 0.1
University of Texas Medical Branch School of Medicine 1 0.1
University of Virginia School of Medicine 1 0.1
Washington State University Elson S. Floyd College of Medicine 1 0.1
West Virginia School of Osteopathic Medicine 1 0.1

And just to explicitly show the overall N after filtering:

na_data %>% 
  count(name = "n_overall")
## # A tibble: 1 x 1
##   n_overall
##       <int>
## 1      1162

All other demographic variables

demo_overall <- 
  na_data %>% 
  count(degree_type, name = "count") %>% 
  mutate(percentage = (count / sum(count) * 100) %>% round(1)) %>% 
  bind_rows(
    na_data %>% 
      count(name = "count") %>% 
      mutate(
        percentage = 100,
        degree_type = "Full Sample"
      )
  ) %>% 
  pivot_longer(
    cols = c(count, percentage), 
    names_to = "variable", 
    values_to = "values"
  ) %>% 
  pivot_wider(
    names_from = c(degree_type, variable), 
    values_from = values
  ) %>% 
  mutate(variable = "Overall")

demo_breakdown <- 
  na_data %>% 
  group_by(degree_type) %>% 
  select(
    starts_with("so_"), 
    starts_with("gender_"), 
    starts_with("race_"), 
    -contains("description"), 
    -contains("explanation")
  ) %>% 
  summarize_all(
    .funs = function(x) sum(x == "yes")
  ) %>% 
  pivot_longer(
    cols = -degree_type,
    names_to = "variable", 
    values_to = "count"
  ) %>% 
  group_by(
    variable_type = 
      str_extract(variable, ".+_") %>% 
      str_sub(end = -2L)
  ) %>% 
  mutate(percentage = (count / sum(count) * 100) %>% round(1)) %>% 
  bind_rows(
    na_data %>% 
    select(
      starts_with("so_"), 
      starts_with("gender_"), 
      starts_with("race_"), 
      -contains("description"), 
      -contains("explanation")
    ) %>% 
      summarize_all(
        .funs = function(x) sum(x == "yes")
      ) %>% 
      pivot_longer(
        cols = everything(),
        names_to = "variable", 
        values_to = "count"
      ) %>% 
      group_by(
        variable_type = 
          str_extract(variable, ".+_") %>% 
          str_sub(end = -2L)
      ) %>% 
      mutate(
        percentage = (count / sum(count) * 100) %>% round(1), 
        degree_type = "Full Sample"
      )
  )

demo_year <- 
  na_data %>% 
  group_by(degree_type) %>% 
  select(med_school_year) %>% 
  count(med_school_year, name = "count") %>% 
  drop_na() %>% 
  ungroup() %>% 
  mutate(percentage = (count / sum(count) * 100) %>% round(1)) %>% 
  bind_rows(
    na_data %>%
      count(med_school_year, name = "count") %>% 
      drop_na() %>% 
      mutate(
        percentage = (count / sum(count) * 100) %>% round(1),
        degree_type = "Full Sample"
      )
  ) %>% 
  transmute(
    degree_type,
    variable = med_school_year, 
    variable_type = "med_school_year",
    count, 
    percentage
  )

demo_combined <- 
  bind_rows(demo_breakdown, demo_year) %>% 
  pivot_longer(
    cols = c(count, percentage), 
    names_to = "my_variable", 
    values_to = "my_value"
  ) %>% 
  pivot_wider(
    names_from = c(degree_type, my_variable), 
    values_from = my_value
  ) %>% 
  ungroup() %>% 
  mutate(
    variable_type = 
      factor(
        variable_type, 
        levels = c("gender", "so", "race", "med_school_year")
      )
  ) %>% 
  arrange(variable_type, desc(MD_count)) %>% 
  select(-variable_type)

Here we actually make the table…

recode_values <- 
  c(
    "gender_woman" = "Woman", 
    "gender_man" = "Man", 
    "gender_genderqueer" = "Genderqueer/Gender non-conforming", 
    "gender_transman" = "Transgender Man", 
    "gender_agender" = "Agender", 
    "gender_another" = "Another Gender Identity", 
    "gender_transwoman" = "Transgender Woman",
    "so_heterosexual" = "Straight/Heterosexual", 
    "so_gay" = "Gay", 
    "so_bisexual" = "Bisexual", 
    "so_queer" = "Queer", 
    "so_pansexual" = "Pansexual", 
    "so_lesbian" = "Lesbian", 
    "so_questioning" = "Questioning", 
    "so_asexual" = "Asexual",
    "so_another" = "Another Sexual Orientation", 
    "race_white" = "White/Caucasian", 
    "race_asian" = "Asian", 
    "race_hispanic" = "Latino or Hispanic",
    "race_black" = "Black or African American", 
    "race_another" = "Another Race", 
    "race_native" = "American Indian or Alaska Native", 
    "race_pi" = "Native Hawaiian or Other Pacific Islander"
  )

demo_overall %>% 
  bind_rows(demo_combined) %>%
  transmute(
    recode(variable, !!! recode_values),  
    `Full Sample` = str_glue("{`Full Sample_count`} ({`Full Sample_percentage`}%)"), 
    `Allopathic (MD)` = str_glue("{MD_count} ({MD_percentage}%)"), 
    `Osteopathic (DO)` = str_glue("{DO_count} ({DO_percentage}%)"), 
  ) %>% 
  kable(col.names = c(" ", colnames(.)[-1])) %>% 
  kable_styling(
    bootstrap_options = c("striped", "condensed"),
    full_width = F
  ) %>%
  row_spec(1, bold = T) %>% 
  pack_rows("Gender", 2, 8, label_row_css = "background-color: #666; color: #fff;") %>%
  pack_rows("Sexual Orientation", 9, 17, label_row_css = "background-color: #666; color: #fff;") %>% 
  pack_rows("Race/Ethnicity", 18, 24, label_row_css = "background-color: #666; color: #fff;") %>% 
  pack_rows("Year in School", 25, 28, label_row_css = "background-color: #666; color: #fff;") 
Full Sample Allopathic (MD) Osteopathic (DO)
Overall 1162 (100%) 1082 (93.1%) 80 (6.9%)
Gender
Woman 674 (55.6%) 617 (50.9%) 57 (4.7%)
Man 433 (35.7%) 411 (33.9%) 22 (1.8%)
Genderqueer/Gender non-conforming 72 (5.9%) 67 (5.5%) 5 (0.4%)
Transgender Man 13 (1.1%) 13 (1.1%) 0 (0%)
Agender 11 (0.9%) 11 (0.9%) 0 (0%)
Another Gender Identity 7 (0.6%) 6 (0.5%) 1 (0.1%)
Transgender Woman 3 (0.2%) 3 (0.2%) 0 (0%)
Sexual Orientation
Straight/Heterosexual 521 (36%) 498 (34.4%) 23 (1.6%)
Gay 239 (16.5%) 221 (15.3%) 18 (1.2%)
Bisexual 224 (15.5%) 208 (14.4%) 16 (1.1%)
Queer 215 (14.9%) 200 (13.8%) 15 (1%)
Pansexual 82 (5.7%) 72 (5%) 10 (0.7%)
Lesbian 82 (5.7%) 70 (4.8%) 12 (0.8%)
Questioning 37 (2.6%) 33 (2.3%) 4 (0.3%)
Asexual 34 (2.4%) 31 (2.1%) 3 (0.2%)
Another Sexual Orientation 12 (0.8%) 11 (0.8%) 1 (0.1%)
Race/Ethnicity
White/Caucasian 870 (67.3%) 812 (62.8%) 58 (4.5%)
Asian 184 (14.2%) 169 (13.1%) 15 (1.2%)
Latino or Hispanic 127 (9.8%) 113 (8.7%) 14 (1.1%)
Black or African American 71 (5.5%) 71 (5.5%) 0 (0%)
Another Race 18 (1.4%) 18 (1.4%) 0 (0%)
American Indian or Alaska Native 13 (1%) 12 (0.9%) 1 (0.1%)
Native Hawaiian or Other Pacific Islander 9 (0.7%) 4 (0.3%) 5 (0.4%)
Year in School
Pre-Clinical Student (prior to clerkships) 711 (61.2%) 663 (57.1%) 48 (4.1%)
Clinical Student (on clerkships) 374 (32.2%) 344 (29.6%) 30 (2.6%)
Research (PhD, Masters, or other) 50 (4.3%) 49 (4.2%) 1 (0.1%)
Other 26 (2.2%) 25 (2.2%) 1 (0.1%)

Outness plots

context_recode <- 
  c(
    "classmates_peers" = "To peers", 
    "labmates_coworkers_team" = "To lab-mates or team members", 
    "mentors" = "To mentors", 
    "medical_school_app" = "On application to medical school", 
    "residency_app" = "On application to residency/post-doctoral studies"
  )

n <- 
  na_data %>% 
  drop_na(school_attend, is_lgbtq) %>% 
  nrow()
  
outness_data <- 
  na_data %>% 
  select(
    starts_with("out_"), 
    starts_with("ability_out_"), 
    starts_with("protections_out"), 
    sex, 
    is_urm, 
    is_gm, 
    school_attend, 
    is_lgbtq
  ) %>% 
  select(
    -contains("explanation"), 
    -contains("other")
  ) %>% 
  drop_na(school_attend, is_lgbtq) %>% 
  pivot_longer(
    cols = 
      c(
        starts_with("out_"), 
        starts_with("ability_out_"), 
        starts_with("protections_out")
      ), 
    names_to = c("variable", "context"), 
    values_to = "is_out", 
    names_sep = "ut_"
  ) %>% 
  mutate(
    variable = 
      recode(
        variable, 
        "o" = "Out (or plan to be out)", 
        "ability_o" = "Support ability to be out", 
        "protections_o" = "Support protections for outness"
      ), 
    context = 
      recode(
        context, 
        !!! context_recode
      ) 
  )

outness_data %>% 
  group_by(is_lgbtq, variable, context) %>% 
  summarize(prop = sum(is_out == "yes") / n()) %>% 
  mutate(context = fct_reorder(context, desc(prop))) %>% 
  ggplot(aes(x = context, y = prop, fill = is_lgbtq)) + 
  geom_col(position = "dodge", color = "black") +
  geom_text(
    aes(label = (prop * 100) %>% round(0)), 
    vjust = -0.2, 
    size = 4, 
    position = position_dodge(width = 0.9), 
    fontface = "bold"
  ) + 
  facet_wrap(vars(variable), ncol = 3) + 
  scale_y_continuous(labels = scales::label_percent(accuracy = 1)) + 
  scale_fill_manual(
    breaks = c("LGBTQ+", "Non-LGBTQ+"), 
    values = c(mspa_hex, "gray60") 
  ) + 
  theme(
    legend.position = "bottom", 
    axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
    ) + 
  labs(
    x = NULL, 
    y = "% of Students", 
    fill = NULL, 
    caption = str_glue("N = {n}")
  )

Alter na_data to include a variable that we will use later

na_data <- 
  na_data %>% 
  mutate(
    gender_expansive = 
      if_else(
          gender_genderqueer == "yes" | 
          gender_agender == "yes" | 
          gender_transman == "yes" | 
          gender_transwoman == "yes" | 
          gender_another == "yes",
        "yes", 
        "no"
      ), 
    gender_nonexpansive = 
      if_else(
        gender_expansive == "no", 
        "yes", 
        "no"
      )
  )
so_vars <- 
  colnames(na_data)[
    setdiff(
      starts_with(match = "so_", vars = colnames(na_data)), 
      c(
        ends_with(match = "_description", vars = colnames(na_data)), 
        ends_with(match = "_explanation", vars = colnames(na_data))
      )
    )
  ]

demographic_vars <- 
  c("so_", "gender_", "race_") %>% 
  map(
    ~ setdiff(
      starts_with(match = ., vars = colnames(na_data)), 
      ends_with(match = "_description", vars = colnames(na_data))
    )
  ) %>% 
  unlist() %>% 
  colnames(na_data)[.]
      

tally_outness <- function(my_var) {
  na_data %>% 
    filter(is_lgbtq == "LGBTQ+") %>% 
    filter(!! sym(my_var) == "yes") %>% 
    select(starts_with("out_")) %>%
    select(-ends_with("_explanation"), -ends_with("_other")) %>% 
    summarize_all(
      #list(
        #count = function(x) sum(x == "yes"),
        #percentage = 
      function(x) sum(x == "yes") / length(x) * 100
      #)
    ) %>% 
    pivot_longer(
      cols = everything(), 
      names_to = "environment", 
      values_to = "percent_out", 
      names_prefix = "out_"
    )
}

outness_subgroup <- 
  tibble(identity = demographic_vars) %>% 
  mutate(
    data = map(identity, .f = tally_outness)
  ) %>% 
  unnest() %>% 
  separate(col = identity, into = c("var_type", "identity"), sep = "_")

# calculate total number of people in each demographic subgroup
demographics_n <- 
  na_data %>% 
  filter(is_lgbtq == "LGBTQ+") %>% 
  summarize_at(
    .vars = vars(starts_with("so_"), starts_with("gender_"), starts_with("race")), 
    ~ sum(. == "yes")
  ) %>% 
  select(-contains("another")) %>% 
  pivot_longer(
    cols = everything(), 
    names_to = "identity", 
    values_to = "n", 
  ) %>% 
  separate(col = identity, into = c("var_type", "identity"))

demographics_n
## # A tibble: 22 x 3
##    var_type identity         n
##    <chr>    <chr>        <int>
##  1 so       asexual         26
##  2 so       bisexual       213
##  3 so       gay            237
##  4 so       lesbian         81
##  5 so       pansexual       78
##  6 so       queer          214
##  7 so       questioning     13
##  8 so       heterosexual     7
##  9 gender   man            272
## 10 gender   woman          304
## # … with 12 more rows
so_labels <- 
  function(label) {
    new_label <-
      str_glue(
        "{title} (n = {number})", 
        title = str_to_title(label), 
        number = 
          demographics_n %>% 
          filter(identity %in% label) %>% 
          pull(n)
      )
    new_label
  }

overall_outness_data <- 
  outness_data %>% 
  filter(is_lgbtq == "LGBTQ+") %>% 
  group_by(variable, context) %>%
  summarize(prop = sum(is_out == "yes") / n()) %>%
  mutate(
    environment = fct_reorder(context, desc(prop)), 
    percent_out = (prop * 100) %>% round(1), 
    identity = "overall"
  )

overall_outness_data
## # A tibble: 15 x 6
## # Groups:   variable [3]
##    variable      context           prop environment         percent_out identity
##    <chr>         <chr>            <dbl> <fct>                     <dbl> <chr>   
##  1 Out (or plan… On application … 0.563 On application to …        56.3 overall 
##  2 Out (or plan… On application … 0.289 On application to …        28.9 overall 
##  3 Out (or plan… To lab-mates or… 0.591 To lab-mates or te…        59.1 overall 
##  4 Out (or plan… To mentors       0.474 To mentors                 47.4 overall 
##  5 Out (or plan… To peers         0.908 To peers                   90.8 overall 
##  6 Support abil… On application … 0.917 On application to …        91.7 overall 
##  7 Support abil… On application … 0.914 On application to …        91.4 overall 
##  8 Support abil… To lab-mates or… 0.949 To lab-mates or te…        94.9 overall 
##  9 Support abil… To mentors       0.946 To mentors                 94.6 overall 
## 10 Support abil… To peers         0.973 To peers                   97.3 overall 
## 11 Support prot… On application … 0.943 On application to …        94.3 overall 
## 12 Support prot… On application … 0.927 On application to …        92.7 overall 
## 13 Support prot… To lab-mates or… 0.941 To lab-mates or te…        94.1 overall 
## 14 Support prot… To mentors       0.959 To mentors                 95.9 overall 
## 15 Support prot… To peers         0.959 To peers                   95.9 overall
# SO plot
so_plot_frame <- 
  outness_subgroup %>% 
  filter(
    var_type == "so", 
    !(identity %in% c("heterosexual", "questioning", "asexual", "another"))
  ) %>% 
  mutate(
    environment = 
      recode(environment, !!! context_recode) %>% 
      fct_reorder(percent_out, .fun = mean, .desc = TRUE), 
    identity = fct_reorder2(identity, environment, percent_out)
  )
so_plot <- 
  ggplot() + 
  geom_line(
    data = so_plot_frame, 
    mapping = 
      aes(
        x = environment, 
        y = percent_out, 
        color = identity, 
        group = identity
      ), 
    size = 1
  ) +
  geom_point(
    data = so_plot_frame, 
    mapping = 
      aes(
        x = environment, 
        y = percent_out, 
        fill = identity 
      ), 
    shape = 21, 
    size = 3
  ) + 
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) + 
  scale_y_continuous(
    limits = c(15, 105), 
    labels = scales::label_percent(accuracy = 1, scale = 1)
  ) +   
  scale_fill_tableau(labels = so_labels) + 
  scale_color_tableau(labels = so_labels) + 
  theme_bw() + 
  theme(
    axis.text.x = element_text(size = 10, angle = 30, hjust = 1)
  ) + 
  labs(
    subtitle = "Outness by sexual orientation", 
    x = NULL, 
    y = "Percentage of \"out\" students", 
    fill = NULL, 
    color = NULL
  )

so_plot %>% 
  ggsave(
    filename = "so_plot.pdf", 
    plot = ., 
    device = "pdf", 
    path = figure_out_path, 
    width = 6.5, 
    height = 5,
    units = "in"
  )

so_plot

Note that in this plot we only show identities that are >5% of the total number of responses.

so_plot_gray <- 
  ggplot() + 
  geom_line(
    data = so_plot_frame, 
    mapping = 
      aes(
        x = environment, 
        y = percent_out, 
        group = identity, 
        linetype = identity
      ), 
    color = "gray30",
    size = 1
  ) +
  geom_point(
    data = so_plot_frame, 
    mapping = 
      aes(
        x = environment, 
        y = percent_out, 
        fill = identity, 
        shape = identity
      ), 
    fill = "black",
    color = "gray60",
    size = 4
  ) + 
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) + 
  scale_y_continuous(
    limits = c(15, 105), 
    labels = scales::label_percent(accuracy = 1, scale = 1)
  ) + 
  scale_shape_manual(
    values = 21:25,
    labels = so_labels
  ) + 
  scale_linetype_discrete(labels = so_labels, guide = FALSE) + 
  theme_bw() + 
  theme(
    axis.text.x = element_text(size = 10, angle = 30, hjust = 1)
  ) + 
  labs(
    subtitle = "Outness by sexual orientation", 
    x = NULL, 
    y = "Percentage of \"out\" students", 
    shape = NULL, 
    linetype = NULL
  )

so_plot_gray

so_plot_gray %>% 
  ggsave(
    filename = "so_plot_gray.pdf", 
    plot = ., 
    device = "pdf", 
    path = figure_out_path, 
    width = 6.5, 
    height = 5,
    units = "in"
    )

By gender identity…

gender_plot_frame <-
  outness_subgroup %>% 
  filter(identity %in% c("expansive", "nonexpansive")) %>% 
  mutate(
    environment = 
      recode(environment, !!! context_recode) %>% 
      factor(levels = levels(so_plot_frame$environment)), 
    identity = 
      identity %>% 
      recode(expansive = "Gender-expansive/Gender-diverse", nonexpansive = "Cisgender") %>% 
      str_wrap(width = 17) %>% 
      fct_reorder2(environment, percent_out)
  ) 

gender_plot <- 
  gender_plot_frame %>% 
  ggplot(aes(x = environment, y = percent_out, fill = identity)) +
  geom_line(aes(color = identity, group = identity), alpha = 0.8, size = 1) + 
  geom_point(shape = 21, size = 3, alpha = 1) + 
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) + 
  scale_y_continuous(
    limits = c(15, 105), 
    labels = scales::label_percent(accuracy = 1, scale = 1), 
  ) + 
  scale_fill_tableau(
    labels = 
      function(x) {
        str_glue(
          "{x} (n = {num})", 
          num = 
            demographics_n %>% 
            mutate(
              identity = 
                identity %>% 
                recode(
                  expansive = "Gender-expansive/Gender-diverse", 
                  nonexpansive = "Cisgender"
                ) %>% 
                str_wrap(width = 17) 
            ) %>% 
            filter(identity %in% x) %>% 
            pull(n)
        )
      }
  ) + 
  scale_color_tableau(
    labels = 
      function(x) {
        str_glue(
          "{x} (n = {num})", 
          num = 
            demographics_n %>% 
            mutate(
              identity = 
                identity %>% 
                recode(
                  expansive = "Gender-expansive/Gender-diverse", 
                  nonexpansive = "Cisgender"
                ) %>% 
                str_wrap(width = 17) 
            ) %>% 
            filter(identity %in% x) %>% 
            pull(n)
        )
      }
  ) + 
  theme_bw() + 
  theme(
    axis.text.x = element_text(size = 10, angle = 30, hjust = 1)
  ) + 
  labs(
    subtitle = "Outness by gender identity", 
    x = NULL, 
    y = "Percentage of \"out\" students", 
    fill = NULL, 
    color = NULL
  ) 

gender_plot

gender_plot %>% 
  ggsave(
    filename = "gender_plot.pdf", 
    plot = ., 
    device = "pdf", 
    path = figure_out_path, 
    width = 6.5, 
    height = 5,
    units = "in"
  )
gender_labels <- 
  function(x) {
    str_glue(
      "{x} (n = {num})", 
      num = 
        demographics_n %>% 
        mutate(
          identity = 
            identity %>% 
            recode(
              expansive = "Gender-expansive/Gender-diverse", 
              nonexpansive = "Cisgender"
            ) %>% 
            str_wrap(width = 17) 
        ) %>% 
        filter(identity %in% x) %>% 
        pull(n)
    )
  }

gender_plot_gray <- 
  gender_plot_frame %>% 
  ggplot(aes(x = environment, y = percent_out)) +
  geom_line(aes(color = identity, group = identity), alpha = 0.8, size = 1) + 
  geom_point(
    aes(fill = identity), 
    color = "black", 
    shape = 21, 
    size = 3, 
    alpha = 1
  ) + 
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) + 
  scale_y_continuous(
    limits = c(15, 105), 
    labels = scales::label_percent(accuracy = 1, scale = 1)
  ) + 
  scale_shape_manual(values = 21:22, labels = gender_labels) + 
  scale_color_manual(values = c("black", "gray"), labels = gender_labels) + 
  scale_fill_manual(values = c("black", "gray"), labels = gender_labels) + 
  theme_bw() + 
  theme(
    axis.text.x = element_text(size = 10, angle = 30, hjust = 1)
  ) + 
  labs(
    subtitle = "Outness by gender identity", 
    x = NULL, 
    y = "Percentage of \"out\" students", 
    fill = NULL, 
    color = NULL
  ) 

gender_plot_gray

gender_plot_gray %>% 
  ggsave(
    filename = "gender_plot_gray.pdf", 
    plot = ., 
    device = "pdf", 
    path = figure_out_path, 
    width = 6.5, 
    height = 5,
    units = "in"
  )

Support for outness plots:

context_order <- 
  outness_data %>% 
  filter(variable == "Out (or plan to be out)") %>% 
  group_by(context) %>% 
  summarize(prop = sum(is_out == "yes") / n()) %>% 
  arrange(desc(prop)) %>% 
  pull(context)
  

support_plot <- 
  outness_data %>% 
  filter(!(variable == "Out (or plan to be out)" & is_lgbtq != "LGBTQ+")) %>% 
  group_by(variable, context) %>%
  summarize(prop = sum(is_out == "yes") / n()) %>%
  mutate(
    environment = factor(context, levels = context_order), 
    percent_out = (prop * 100) %>% round(1), 
    identity = "overall"
  ) %>% 
  ggplot(aes(x = environment, y = percent_out)) + 
  geom_col(
    aes(fill = variable), 
    position = "dodge", 
    color = "black", 
    size = 0.6
  ) +
  geom_text(
    aes(
      group = variable, 
      label = str_glue("{num}%", num = percent_out %>% round(0))
    ), 
    size = 4, 
    position = position_dodge(width = 0.91), 
    fontface = "bold", 
    vjust = -0.3
  ) + 
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) + 
  scale_y_continuous(
    labels = scales::label_percent(accuracy = 1, scale = 1), 
    limits = c(NA, 101)
  ) + 
  scale_fill_tableau() + 
  theme_bw() + 
  theme(
    legend.position = "bottom", 
    axis.text.x = element_text(size = 10, angle = 30, hjust = 1),
  ) + 
  labs(
    x = NULL, 
    y = "% of students", 
    fill = NULL,
    caption = str_glue("N = {nrow(na_data)}")
  )

support_plot

support_plot %>% 
  ggsave(
    filename = "support_plot.pdf", 
    plot = ., 
    device = "pdf", 
    path = figure_out_path, 
    width = 7.25, 
    height = 5,
    units = "in"
  )
support_plot_gray <- 
  support_plot + 
  scale_fill_tableau(palette = "Classic Gray 5")

support_plot_gray

support_plot_gray %>% 
  ggsave(
    filename = "support_plot_gray.pdf", 
    plot = ., 
    device = "pdf", 
    path = figure_out_path, 
    width = 7.25, 
    height = 5,
    units = "in"
  )

Alluvium Plot

alluvium_plot <- 
  na_data %>% 
  filter(is_lgbtq == "LGBTQ+") %>% 
  count(
    out_classmates_peers, 
    out_medical_school_app, 
    out_residency_app, 
    out_mentors, 
    out_labmates_coworkers_team
  ) %>% 
  ggplot(
    aes(
      y = n, 
      axis1 = out_medical_school_app, 
      axis2 = out_classmates_peers,
      axis3 = out_labmates_coworkers_team,
      axis4 = out_mentors,
      axis5 = out_residency_app
    )
  ) + 
  geom_alluvium(aes(fill = out_medical_school_app), alpha = 0.65) + 
  geom_stratum(width = 1/3, fill = "white", color = "black") + 
  geom_text(stat = "stratum", infer.label = TRUE, size = 3) + 
  scale_x_discrete(
    limits = 
      c(
        "out_medical_school_app",
        "out_classmates_peers",
        "out_labmates_coworkers_team", 
        "out_mentors", 
        "out_residency_app"
      ), 
    labels = 
      c(
        "Out on medical school application?", 
        "Out to peers?", 
        "Out to lab-mates/team-members?",
        "Out to mentors?", 
        "Out on residency application?"
      ) %>% 
      str_wrap(width = 20),
    expand = c(0.1, 0.1)
  ) + 
  scale_y_continuous(
    labels = scales::label_number()
  ) + 
  scale_fill_tableau() + 
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(
    subtitle = "Over time, medical students who were \"out\" on their applications to\nmedical school become less professionally \"out\"", 
    x = NULL, 
    y = "Number of students", 
    fill = NULL,
    caption = str_glue("N = {(na_data$is_lgbtq == \"LGBTQ+\") %>% sum(na.rm = TRUE)}")
  )

alluvium_plot

alluvium_plot %>% 
  ggsave(
    filename = "outness_alluvium.pdf", 
    plot = ., 
    device = "pdf", 
    path = file.path(figure_out_path), 
    width = 6, 
    height = 4.5, 
    units = "in"
  )

Significance testing

Outness at different stages of training

contingency_table <- 
  table(
    outness_data %>% 
      filter(is_lgbtq == "LGBTQ+", variable == "Out (or plan to be out)") %>% 
      pull(context), 
    outness_data %>% 
      filter(is_lgbtq == "LGBTQ+", variable == "Out (or plan to be out)") %>% 
      pull(is_out)
  )

# chi-squared test for independence
contingency_table %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 518.29, df = 4, p-value < 2.2e-16
p_values <- 
  matrix(nrow = nrow(contingency_table), ncol = nrow(contingency_table))

chi_squared_values <- 
    matrix(nrow = nrow(contingency_table), ncol = nrow(contingency_table))
  

row.names(p_values) <- row.names(contingency_table)
colnames(p_values) <- row.names(contingency_table)

row.names(chi_squared_values) <- row.names(contingency_table)
colnames(chi_squared_values) <- row.names(contingency_table)


for (i in 1:5) {
  for (j in 1:5) {
    contingency_table[c(i,j),] %>%
      chisq.test() %>%
      `$`(p.value) ->
      p_values[i,j]
    
    contingency_table[c(i,j),] %>%
      chisq.test() %>%
      `$`(statistic) ->
      chi_squared_values[i,j]
  }
}

#bonferroni correction
p_values <- p_values * 10

p_values %>% 
  knitr::kable() %>% 
  kable_styling()
On application to medical school On application to residency/post-doctoral studies To lab-mates or team members To mentors To peers
On application to medical school 10.0000000 0 3.3194253 0.0191287 0
On application to residency/post-doctoral studies 0.0000000 10 0.0000000 0.0000000 0
To lab-mates or team members 3.3194253 0 10.0000000 0.0003705 0
To mentors 0.0191287 0 0.0003705 10.0000000 0
To peers 0.0000000 0 0.0000000 0.0000000 10
chi_squared_values %>% 
  knitr::kable() %>% 
  kable_styling()
On application to medical school On application to residency/post-doctoral studies To lab-mates or team members To mentors To peers
On application to medical school 0.0000000 95.0541 0.9413047 9.63132 190.5470
On application to residency/post-doctoral studies 95.0541045 0.0000 115.2183962 44.55080 498.0326
To lab-mates or team members 0.9413047 115.2184 0.0000000 17.01666 166.0308
To mentors 9.6313197 44.5508 17.0166565 0.00000 275.3272
To peers 190.5470270 498.0326 166.0308438 275.32717 0.0000

Outness vs. support for outness in different stages of medical training

#to peers 
table(
  outness_data %>% 
    filter(context == "To peers") %>% 
    pull(variable), 
  outness_data %>% 
    filter(context == "To peers") %>% 
    pull(is_out)
) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 1021.2, df = 2, p-value < 2.2e-16
#to labmates 
table(
  outness_data %>% 
    filter(context == "To lab-mates or team members") %>% 
    pull(variable), 
  outness_data %>% 
    filter(context == "To lab-mates or team members") %>% 
    pull(is_out)
) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 1468.6, df = 2, p-value < 2.2e-16
# MD application 
table(
  outness_data %>% 
    filter(context == "On application to medical school") %>% 
    pull(variable), 
  outness_data %>% 
    filter(context == "On application to medical school") %>% 
    pull(is_out)
) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 1223.6, df = 2, p-value < 2.2e-16
#to mentors 
table(
  outness_data %>% 
    filter(context == "To mentors") %>% 
    pull(variable), 
  outness_data %>% 
    filter(context == "To mentors") %>% 
    pull(is_out)
) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 1664.9, df = 2, p-value < 2.2e-16
#to GME 
table(
  outness_data %>% 
    filter(context == "On application to residency/post-doctoral studies") %>% 
    pull(variable), 
  outness_data %>% 
    filter(context == "On application to residency/post-doctoral studies") %>% 
    pull(is_out)
) %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 1745.3, df = 2, p-value < 2.2e-16